Motivation

I started watching David Robinson’s Tidy Tuesday screencasts to learn more about data wrangling and visualization (exploratory data analysis). I also do research in teaching and learning languages, so this was a great excuse to port his R code to Python and note differences between the two.

Week 29

An analysis of the 538 “College Major and Income” dataset from the #tidytuesday project.

David’s code

Dependencies

R

library(tidyverse)
library(scales)
library(ggrepel)
library(plotly)
theme_set(theme_light())

Python

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

pd.set_option('display.max_columns', 500)

Reading data

R

recent_grads <- read_csv("recent-grads.csv")
head(recent_grads)
## # A tibble: 6 x 21
##    Rank Major_code Major Total   Men Women Major_category ShareWomen Sample_size
##   <dbl>      <dbl> <chr> <dbl> <dbl> <dbl> <chr>               <dbl>       <dbl>
## 1     1       2419 PETR…  2339  2057   282 Engineering         0.121          36
## 2     2       2416 MINI…   756   679    77 Engineering         0.102           7
## 3     3       2415 META…   856   725   131 Engineering         0.153           3
## 4     4       2417 NAVA…  1258  1123   135 Engineering         0.107          16
## 5     5       2405 CHEM… 32260 21239 11021 Engineering         0.342         289
## 6     6       2418 NUCL…  2573  2200   373 Engineering         0.145          17
## # … with 12 more variables: Employed <dbl>, Full_time <dbl>, Part_time <dbl>,
## #   Full_time_year_round <dbl>, Unemployed <dbl>, Unemployment_rate <dbl>,
## #   Median <dbl>, P25th <dbl>, P75th <dbl>, College_jobs <dbl>,
## #   Non_college_jobs <dbl>, Low_wage_jobs <dbl>

Python

recent_grads = pd.read_csv("recent-grads.csv") # side note: entering direct raw github url resulted in some parser error.
recent_grads.head()
##    Rank  Major_code                                      Major    Total  \
## 0     1        2419                      PETROLEUM ENGINEERING   2339.0   
## 1     2        2416             MINING AND MINERAL ENGINEERING    756.0   
## 2     3        2415                  METALLURGICAL ENGINEERING    856.0   
## 3     4        2417  NAVAL ARCHITECTURE AND MARINE ENGINEERING   1258.0   
## 4     5        2405                       CHEMICAL ENGINEERING  32260.0   
## 
##        Men    Women Major_category  ShareWomen  Sample_size  Employed  \
## 0   2057.0    282.0    Engineering    0.120564           36      1976   
## 1    679.0     77.0    Engineering    0.101852            7       640   
## 2    725.0    131.0    Engineering    0.153037            3       648   
## 3   1123.0    135.0    Engineering    0.107313           16       758   
## 4  21239.0  11021.0    Engineering    0.341631          289     25694   
## 
##    Full_time  Part_time  Full_time_year_round  Unemployed  Unemployment_rate  \
## 0       1849        270                  1207          37           0.018381   
## 1        556        170                   388          85           0.117241   
## 2        558        133                   340          16           0.024096   
## 3       1069        150                   692          40           0.050125   
## 4      23170       5180                 16697        1672           0.061098   
## 
##    Median  P25th   P75th  College_jobs  Non_college_jobs  Low_wage_jobs  
## 0  110000  95000  125000          1534               364            193  
## 1   75000  55000   90000           350               257             50  
## 2   73000  50000  105000           456               176              0  
## 3   70000  43000   80000           529               102              0  
## 4   65000  50000   75000         18314              4440            972

Data cleaning

R

# cleaning step
major_processed <- recent_grads %>%
    arrange(desc(Median)) %>%
    # change all caps to words with first letter capitalized
    mutate(Major = str_to_title(Major), 
        Major = fct_reorder(Major, Median))
head(major_processed[c('Major', 'Median')], 15)
## # A tibble: 15 x 2
##    Major                                     Median
##    <fct>                                      <dbl>
##  1 Petroleum Engineering                     110000
##  2 Mining And Mineral Engineering             75000
##  3 Metallurgical Engineering                  73000
##  4 Naval Architecture And Marine Engineering  70000
##  5 Chemical Engineering                       65000
##  6 Nuclear Engineering                        65000
##  7 Actuarial Science                          62000
##  8 Astronomy And Astrophysics                 62000
##  9 Mechanical Engineering                     60000
## 10 Electrical Engineering                     60000
## 11 Computer Engineering                       60000
## 12 Aerospace Engineering                      60000
## 13 Biomedical Engineering                     60000
## 14 Materials Science                          60000
## 15 Engineering Mechanics Physics And Science  58000

Python

import string
major_processed = recent_grads.sort_values(by='Median', ascending=False)
# mutate(Major = str_to_title(Major), -> use pandas apply w. string.capwords
#         Major = fct_reorder(Major, Median)) -> just subset w. [ using -Median and index with the sorted indices
major_processed['Major'] = major_processed['Major'].apply(string.capwords)[(-major_processed['Median']).argsort()]
# Gotcha: argsort() will mess up row index order for tied Median, even setting the kind param of argsort didn't work
major_processed.sort_index(inplace = True)
major_processed[['Major', 'Median']].head(15)
##                                         Major  Median
## 0                       Petroleum Engineering  110000
## 1              Mining And Mineral Engineering   75000
## 2                   Metallurgical Engineering   73000
## 3   Naval Architecture And Marine Engineering   70000
## 4                        Chemical Engineering   65000
## 5                         Nuclear Engineering   65000
## 6                           Actuarial Science   62000
## 7                  Astronomy And Astrophysics   62000
## 8                      Mechanical Engineering   60000
## 9                      Electrical Engineering   60000
## 10                       Computer Engineering   60000
## 11                      Aerospace Engineering   60000
## 12                     Biomedical Engineering   60000
## 13                          Materials Science   60000
## 14  Engineering Mechanics Physics And Science   58000

Aggregating by category

R

# Common pattern: group_by -> summarize -> arrange
by_major_category <- major_processed %>%
    filter(!is.na(Total)) %>%
    group_by(Major_category) %>%
    # summarize is nice when you need to apply different functions to columns
    summarize(Men = sum(Men),
              Women = sum(Women),
              Total = sum(Total),
              # weight the medians by sample size
              MedianSalary = sum(Median * Sample_size) / sum(Sample_size)) %>%
    mutate(ShareWomen = Women / Total) %>%
    arrange(desc(ShareWomen))
by_major_category
## # A tibble: 16 x 6
##    Major_category                      Men  Women  Total MedianSalary ShareWomen
##    <chr>                             <dbl>  <dbl>  <dbl>        <dbl>      <dbl>
##  1 Health                            75517 387713 4.63e5       43694.      0.837
##  2 Education                        103526 455603 5.59e5       32364.      0.815
##  3 Psychology & Social Work          98115 382892 4.81e5       31234.      0.796
##  4 Interdisciplinary                  2817   9479 1.23e4       35000       0.771
##  5 Communications & Journalism      131921 260680 3.93e5       34738.      0.664
##  6 Arts                             134390 222740 3.57e5       32046.      0.624
##  7 Humanities & Liberal Arts        272846 440622 7.13e5       32192.      0.618
##  8 Biology & Life Science           184919 268943 4.54e5       34379.      0.593
##  9 Industrial Arts & Consumer Serv… 103781 126011 2.30e5       34186.      0.548
## 10 Social Science                   256834 273132 5.30e5       39200.      0.515
## 11 Law & Public Policy               91129  87978 1.79e5       35635.      0.491
## 12 Business                         667852 634524 1.30e6       40890.      0.487
## 13 Physical Sciences                 95390  90089 1.85e5       38477.      0.486
## 14 Agriculture & Natural Resources   40357  35263 7.56e4       35586.      0.466
## 15 Computers & Mathematics          208725  90283 2.99e5       46993.      0.302
## 16 Engineering                      408307 129276 5.38e5       55916.      0.240

Python

# R -> pandas challenge (help!):
# There might be a better way to do this, but can't figure out how MedianSalary and ShareWomen can be 
# declared within the agg() like you could with R's summerise

# Workaround (which is actually fewer lines):
# For now, setting with breaking things up into statements
by_major_category_grp = major_processed[major_processed.Total.notnull()].groupby('Major_category')
# Do the groupby / sum on the Men, Women, Total
by_major_category = by_major_category_grp['Men', 'Women', 'Total'].sum()
# Before creating the MedianSalary column.
by_major_category['MedianSalary'] =  by_major_category_grp.apply(lambda x: sum(x.Median * x.Sample_size) / sum(x.Sample_size))
# Transform ShareWomen
by_major_category['ShareWomen'] = by_major_category['Women'] / by_major_category['Total']
by_major_category.sort_values('ShareWomen', ascending=False, inplace=True)
by_major_category
##                                           Men     Women      Total  \
## Major_category                                                       
## Health                                75517.0  387713.0   463230.0   
## Education                            103526.0  455603.0   559129.0   
## Psychology & Social Work              98115.0  382892.0   481007.0   
## Interdisciplinary                      2817.0    9479.0    12296.0   
## Communications & Journalism          131921.0  260680.0   392601.0   
## Arts                                 134390.0  222740.0   357130.0   
## Humanities & Liberal Arts            272846.0  440622.0   713468.0   
## Biology & Life Science               184919.0  268943.0   453862.0   
## Industrial Arts & Consumer Services  103781.0  126011.0   229792.0   
## Social Science                       256834.0  273132.0   529966.0   
## Law & Public Policy                   91129.0   87978.0   179107.0   
## Business                             667852.0  634524.0  1302376.0   
## Physical Sciences                     95390.0   90089.0   185479.0   
## Agriculture & Natural Resources       40357.0   35263.0    75620.0   
## Computers & Mathematics              208725.0   90283.0   299008.0   
## Engineering                          408307.0  129276.0   537583.0   
## 
##                                      MedianSalary  ShareWomen  
## Major_category                                                 
## Health                               43693.740419    0.836977  
## Education                            32363.517503    0.814844  
## Psychology & Social Work             31234.402516    0.796022  
## Interdisciplinary                    35000.000000    0.770901  
## Communications & Journalism          34738.243123    0.663982  
## Arts                                 32045.858896    0.623694  
## Humanities & Liberal Arts            32192.322097    0.617578  
## Biology & Life Science               34378.549849    0.592566  
## Industrial Arts & Consumer Services  34185.588915    0.548370  
## Social Science                       39200.371098    0.515376  
## Law & Public Policy                  35635.142119    0.491204  
## Business                             40889.841986    0.487205  
## Physical Sciences                    38477.396658    0.485710  
## Agriculture & Natural Resources      35586.142322    0.466318  
## Computers & Mathematics              46993.426573    0.301942  
## Engineering                          55915.854649    0.240476

What categories of majors make more money than others?

R

# visualize the distribution of salary for each major using boxplot
major_processed %>%
    # reorder Major_category according to the Median (top -> highest salary)
    mutate(Major_category = fct_reorder(Major_category, Median)) %>%
    # Major_category as X, Median as y
    ggplot(aes(Major_category, Median, fill = Major_category)) +
    geom_boxplot() +
    # reformats the Median (y) as currency in $
    scale_y_continuous(labels = dollar_format()) +
    # since it's hard to read the labels for x axis, flip coords
    expand_limits(y = 0) +
    coord_flip() +
    theme(legend.position = "none")

Python

clear_plots_axes()
major_processed_dist_salary = major_processed.copy()
# reorder Major_category according to the Median (top -> highest salary)
# I'm sure there's a better way, but one way to replicate `fct_reorder` is to do a groupby -> agg -> sort_values
cat_order = major_processed_dist_salary.groupby('Major_category').agg('median').sort_values('Median', ascending=False).index.values.tolist()
# seaborn/matplotlib why are you like this? are there better ways?
_ = plt.xlim(0, major_processed_dist_salary['Median'].max() + 1000)
ax = sns.boxplot(
      x = 'Median',
      y = 'Major_category',
      data = major_processed_dist_salary,
      order = cat_order)
# limitations: there isn't an equivalent expand_limits so I had to use xlim and set the proper upper bound to include 
# the Engineering outlier
_ = plt.xlim(0, major_processed_dist_salary['Median'].max() + 1000)
# put $ in front of Median x-axis tick labels
_ = ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('${x:,.0f}'))
ax.grid(True)
plt.tight_layout()
plt.show()

What are the highest earning majors?

R

major_processed %>%
    filter(Sample_size >= 100) %>%
    head(20) %>%
    ggplot(aes(Major, Median, color=Major_category)) +
    geom_point() +
    # show intervals (range of salaries)
    geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
    # geom_point doesn't start at 0 whereas geom_col does
    # so need to expand scale to start from 0
    expand_limits(y = 0) +
    scale_y_continuous(labels = dollar_format()) +
    coord_flip() +
    labs(title = "What are the highest earning majors?",
        subtitle = "Top 20 majors with at least a 100 graduates surveyed. Bars represent the 25th to 75th percentile.",
        x = "",
        y = "Median salary of graduates")

Python

sns.set_style('whitegrid')
major_processed_highest_earning = major_processed.copy().query('Sample_size >= 100').head(20)
# problem: the majors on the x-axis is sorted differently in Python and the error bars don't look right
ax = sns.pointplot(
      y = 'Major', 
      x = 'Median',
      hue = 'Major_category',
      data = major_processed_highest_earning,
      dodge = False,
      join = False)
# point plot with expanded x-axis to include 0
_ = plt.xlim(0, major_processed_dist_salary['Median'].max() + 1000)
# put $ in front of Median x-axis tick labels
_ = ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('${x:,.0f}'))
# set the legend outside of the plot, slightly above the plot
_ = plt.legend(bbox_to_anchor = (1.8, .0), loc = 'lower center', borderaxespad = 4.)
# get the lower and upper 25th/75th percentile for error bar (not sure if this is the way) did the *.25 to make bars smaller
lower_quartile = major_processed_highest_earning.groupby('Major')['Median'].apply(np.percentile, 25).values*.25
upper_quartile = major_processed_highest_earning.groupby('Major')['Median'].apply(np.percentile, 75).values*.25
quartiles = [lower_quartile, upper_quartile]
# add error bar to each major 
# limitation: couldn't figure out how to make color be the hue (Major_category)! :(
# possible solution: https://stackoverflow.com/a/21915157
# another discrepancy: some of the 
_ = plt.errorbar(
      major_processed_highest_earning.Median, 
      major_processed_highest_earning.Major, 
      xerr = quartiles, capsize = 6, elinewidth = 2, linewidth = 2, fmt = 'none', ecolor='lightgrey')
# title
_ = ax.text(x = 0.5, y = 1.1, s = 'What are the highest earning majors?', fontsize = 20, weight = 'bold', ha = 'center', va = 'bottom', transform = ax.transAxes)
# subtitle
_ = ax.text(x = 0.5, y = 1.05, s = 'Top 20 majors with at least a 100 graduates surveyed. Bars represent the 25th to 75th percentile.', fontsize = 12, alpha = 0.75, ha = 'center', va = 'bottom', transform = ax.transAxes)
plt.tight_layout()
## /Library/Frameworks/Python.framework/Versions/3.6/bin/python3:1: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.
plt.show()

How does gender breakdown relate to typical earnings?

R

major_processed %>%
    arrange(desc(Total)) %>%
    head(20) %>%
    mutate(Major = fct_reorder(Major, Total)) %>%
    # `gather` collapses the two columns (Women, Men) into a single Gender and value is Number
    gather(Gender, Number, Women, Men) %>%
    ggplot(aes(Major, Number, fill = Gender)) +
    scale_y_continuous(labels = comma_format()) +
    geom_col() +
    coord_flip()

Python

sns.set_style("whitegrid")
major_processed_gender = major_processed.copy()
major_processed_gender['total'] = major_processed_gender.Men + major_processed_gender.Women
major_processed_gender = major_processed_gender.sort_values(by='Total', ascending = False).head(20)
cat_order = major_processed_gender.groupby('Major').agg('median').sort_values('Total', ascending=False).index.values.tolist()
_ = plt.xlim(0, major_processed_gender['total'].max() + 10000, 100000)
_ = ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
# this is a simple way to do stacked bar chart: barplot with total first
ax = sns.barplot(y = 'Major', x = 'total', data = major_processed_gender, order = cat_order, label = 'Men', color='#f8766d')
# then barplot with overlaying variable on top
ax = sns.barplot(y = 'Major', x = 'Women', label = 'Women', data = major_processed_gender, order = cat_order, color="#01bfc4")
# need to set legend and label since we didn't make use of the `hue` parameter to do our fill
_ = ax.legend(ncol = 2, frameon = False, fontsize = 'large')
_ = ax.set(xlabel="Number")
# plt.tight_layout()
plt.show()

Interactive line / scatter plot showing the ShareWomen by Median salary

R

g <- major_processed %>%
    mutate(Major_category = fct_lump(Major_category, 4)) %>%
    # the size gives a sense of what is a outlier or not
    ggplot(aes(ShareWomen, Median, color = Major_category, size = Sample_size, label = Major)) +
    geom_point() +
    scale_x_continuous(labels = percent_format()) +
    scale_y_continuous(labels = dollar_format()) +
    geom_smooth(aes(group = 1), method = "lm") +
    expand_limits(y = 0)
# interactive graph to show each one of the aes
ggplotly(g)

Python

Appendix

major_processed %>%
    filter(Sample_size >= 100) %>%
    ggplot(aes(Sample_size, Median)) +
    geom_point() +
    geom_text(aes(label = Major), check_overlap = TRUE, vjust = 1, hjust = 1) +
    scale_x_log10()

# this is so that the rest of the Rmd gets ignored
knitr::knit_exit()